Installing and loading libraries

# Install (only when missing) and attach the packages used in this report.
# The second entry used to test for "lubridate" but install and attach
# "readtext"; lubridate supplies dmy(), which is used further below, so the
# check and the install now refer to the same package.
required_packages <- c("quanteda", "lubridate", "tidyverse", "pdftools")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Make the light ggplot2 theme the default for all plots created below.
theme_set(theme_light())
# Data manipulation verbs (also attached through tidyverse above).
library(dplyr)
# NOTE(review): no haven function is called in the visible code - confirm
# whether this package is still needed.
library(haven)
## Warning: Paket 'haven' wurde unter R Version 4.1.3 erstellt
library(parameters)
## Warning: Paket 'parameters' wurde unter R Version 4.1.3 erstellt
library(performance)
## Warning: Paket 'performance' wurde unter R Version 4.1.3 erstellt
library(see)
## Warning: Paket 'see' wurde unter R Version 4.1.3 erstellt

Data Wrangling of Data set: 1/3 submission_firstround

Task A. Extract text from pdfs in the zip-folder of Consultation Round 1/3

## here() starts at C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining
## [1] 435
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
## PDF error: Invalid Font Weight
# Extract the submission id (the first 7 characters of the file name) for each
# document so the texts can be matched with the survey metadata later.
# N = 435 documents.
# Problem: several documents can share one id (= multiple documents by the
# same submitter).

# basename() drops the directory part regardless of where the project lives;
# the previous version removed a hard-coded absolute path that only exists on
# the original author's machine (and was interpreted as a regular expression).
tb_pdf$Document <- basename(tb_pdf$Document)
ids <- substr(tb_pdf$Document, 1, 7)

tb_pdf$id <- ids
library(readr)
# Read the semicolon-separated survey export of the first consultation round.
Public_consultation_2020 <- read_delim(
  file = "./Data/Public_consultation_2020/files/Public_consultation_2020.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
## New names:
## Rows: 1216 Columns: 73
## -- Column specification
## -------------------------------------------------------- Delimiter: ";" chr
## (73): Reference, Feedback date, Language, User type, First name, Surname...
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `Other, please specify: ` -> `Other, please specify: ...47`
## * `Other, please specify: ` -> `Other, please specify: ...49`
# Attach the extracted PDF texts to the survey metadata; every survey row is
# kept, whether or not a document with the same reference exists.
consult_meta <- Public_consultation_2020 %>% as_tibble()
temp <- consult_meta %>%
  left_join(tb_pdf, by = c("Reference" = "id")) %>%
  as_tibble()
# Observations without PDF text
filter(temp, is.na(text))

Recoding the survey (First submission round)

# Drop variables that are not analysed
# var column nr. 73: temp[, 73]
temp <- select(
  temp,
  !c(`You can upload a document here:\n\n`, `Publication privacy settings`)
)

# Renaming variables
#
# Columns with short, stable names are renamed directly. The long survey
# questions are located with matches(), which treats its argument as a regular
# expression, and each is given a short analysis name.
#
# The rename for 'AI may endanger safety' used to appear twice. After the first
# call no column matches the pattern any more, so the second call selected zero
# columns while its function still returned one name - recent dplyr versions
# reject that size mismatch. The duplicate has been removed.

temp <- temp %>%
  rename(
    filename = Document,
    country = Country,
    org = `Organisation name`,
    id = Reference,
    time = `Feedback date`,
    lang = Language,
    type = `User type`,
    firstname = `First name`,
    surname = Surname,
    scope = Scope,
    register = `Transparency register number`,
    size = `Organisation size`
  ) %>%
  # Section: actions of the coordinated plan
  rename_with(~ "coop_member_states", matches("Working with Member states")) %>%
  rename_with(~ "research_innov", matches("Focussing the efforts of the research and innovation community")) %>%
  rename_with(~ "skills", matches("\n: Skills")) %>%
  rename_with(~ "SME", matches("\n: Focus on SMEs")) %>%
  rename_with(~ "private_sector", matches("\n: Partnership with the private sector")) %>%
  rename_with(~ "public_sector", matches("\n: Promoting the adoption of AI by the public sector")) %>%
  rename_with(~ "other_action", matches("other actions that should be considered?")) %>%
  # Section: areas to strengthen
  rename_with(~ "excel_research", matches("\n: Strengthen excellence in research")) %>%
  rename_with(~ "testing_fac", matches("Establish world-reference testing facilities for AI")) %>%
  rename_with(~ "uptake_ai", matches("Promote the uptake of AI by business and the public sector")) %>%
  rename_with(~ "startup_finance", matches("Increase the financing for start-ups innovating in AI")) %>%
  rename_with(~ "training_skills", matches("Develop skills for AI and adapt existing training programmes")) %>%
  rename_with(~ "eu_data_space", matches("Build up the European data space")) %>%
  rename_with(~ "other_area", matches("Are there other areas that that should be considered")) %>%
  # Section: research and innovation community
  rename_with(~ "lighthouse", matches("Support the establishment of a lighthouse research centre that is world class and able to attract the best minds")) %>%
  rename_with(~ "net_centres", matches("Network of existing AI research excellence centres")) %>%
  rename_with(~ "partner_research", matches("Set up a public-private partnership for industrial research")) %>%
  rename_with(~ "action_research", matches("actions to strengthen the research and innovation community that should be given a priority")) %>%
  # Section: SMEs and Digital Innovation Hubs
  rename_with(~ "benefits_ai", matches("Help to raise SME’s awareness about potential benefits of AI")) %>%
  rename_with(~ "access_testing", matches("Provide access to testing and reference facilities")) %>%
  rename_with(~ "knowhow_transfer", matches("Promote knowledge transfer and support the development of AI expertise for SMEs")) %>%
  rename_with(~ "partner_aiproject", matches("Support partnerships between SMEs, larger enterprises and academia around AI projects")) %>%
  rename_with(~ "equity_finance", matches("Provide information about equity financing for AI startups")) %>%
  rename_with(~ "tasks_innovhub", matches("important for specialised Digital Innovations Hubs")) %>%
  # Section: concerns about AI
  rename_with(~ "concern_safety", matches("AI may endanger safety")) %>%
  rename_with(~ "concern_rights", matches("AI may breach fundamental rights")) %>%
  rename_with(~ "concern_discrim", matches("The use of AI may lead to discriminatory outcomes")) %>%
  rename_with(~ "concern_explain", matches("AI may take actions for which the rationale cannot be explained")) %>%
  rename_with(~ "concern_compensat", matches("AI may make it more difficult for persons having suffered harm to obtain compensation")) %>%
  rename_with(~ "concern_accuracy", matches("AI is not always accurate")) %>%
  rename_with(~ "concern_other", matches("Do you have any other concerns about AI that are not mentioned")) %>%
  rename_with(~ "leg_rules", matches("Do you think that the concerns expressed above can be addressed by applicable EU legislation"))
# Remaining survey questions are renamed by column position (44 to 71).
# NOTE(review): position 65 was originally assigned twice, first
# "reform_assess" and then "risk_procedure"; only the later name took effect
# and is kept here. Confirm that no index from 65 onwards is shifted by one.
names(temp)[44:71] <- c(
  "rules_other",               # 44
  "rules_highrisk",            # 45
  "mitigate_other",            # 46
  "highrisk_approach",         # 47
  "highrisk_other",            # 48
  "highrisk_app",              # 49
  "requir_qual_training_data", # 50
  "requir_record_data",        # 51
  "requir_purpose",            # 52
  "requir_robust_acc",         # 53
  "requir_human_oversight",    # 54
  "requir_liability",          # 55
  "requir_biometric",          # 56
  "requir_spec",               # 57
  "label_aisystem",            # 58
  "label_suggest",             # 59
  "trust_spec",                # 60
  "trust_enforce",             # 61
  "compliance_spec",           # 62
  "risk_spec",                 # 63
  "risk_reform",               # 64
  "risk_procedure",            # 65
  "risk_other",                # 66
  "liability_reform",          # 67
  "liabilty_further",          # 68
  "liability_national",        # 69
  "liabilty_app",              # 70
  "liabilty_other"             # 71
)
# Cooperation with member states: Likert scale 1-5
# (not important at all - very important); "No opinion" is coded as 0 and any
# other value (including a missing answer) becomes NA.
# NOTE(review): coding "No opinion" as 0 places it on the importance scale and
# lowers means - consider NA instead.
temp <- temp %>%
  mutate(coop_member_states = case_when(
    coop_member_states == "5 - Very important" ~ 5,
    coop_member_states == "4 - Important" ~ 4,
    coop_member_states == "3 - Neutral" ~ 3,
    coop_member_states == "2 - Not important" ~ 2,
    coop_member_states == "1 - Not important at all" ~ 1,
    coop_member_states == "No opinion" ~ 0
  ))

# Minimum, maximum, mean and standard deviation of the recoded item.
# sd() needs na.rm = TRUE: with na.rm = FALSE a single missing answer made the
# standard deviation NA (as in the previously rendered output), while
# summary() already ignores missing values.
c(
  summary(temp$coop_member_states)[c("Min.", "Max.", "Mean")],
  "sd" = sd(temp$coop_member_states, na.rm = TRUE)
) %>%
  round(digits = 2)
## Min. Max. Mean   sd 
## 0.00 5.00 4.28   NA
# Histogram of the recoded cooperation item (breaks = 60 suggests about 60 bins).
hist(temp$coop_member_states, breaks = 60)

# Recode the remaining importance items from their text labels to numbers.
#
# The same six-level mapping was previously copy-pasted once per variable;
# it now lives in one helper that is applied to all items with across().

#' Convert importance labels to numeric scores
#'
#' @param x Character vector of survey answers.
#' @return Numeric vector: 5 ("5 - Very important") down to 1
#'   ("1 - Not important at all"), 0 for "No opinion", and NA for any other
#'   value, including missing answers.
recode_importance <- function(x) {
  # NOTE(review): "No opinion" is scored 0, which places it on the importance
  # scale and lowers means - consider NA instead.
  case_when(
    x == "5 - Very important" ~ 5,
    x == "4 - Important" ~ 4,
    x == "3 - Neutral" ~ 3,
    x == "2 - Not important" ~ 2,
    x == "1 - Not important at all" ~ 1,
    x == "No opinion" ~ 0
  )
}

importance_items <- c(
  "research_innov",    # research innovation focus
  "skills",            # skills
  "SME",               # focus on SMEs
  "private_sector",    # partnership with the private sector
  "public_sector",     # adoption of AI by the public sector
  "excel_research",    # strengthen excellence in research
  "testing_fac",       # world-reference testing facilities for AI
  "uptake_ai",         # uptake of AI by business and the public sector
  "startup_finance",   # financing for start-ups innovating in AI
  "training_skills",   # skills for AI and adapted training programmes
  "eu_data_space",     # build up the European data space
  "lighthouse",        # lighthouse research centre
  "net_centres",       # network of existing AI research excellence centres
  "partner_research",  # public-private partnership for industrial research
  "benefits_ai",       # SMEs' awareness of potential benefits of AI
  "access_testing",    # access to testing and reference facilities
  "knowhow_transfer",  # knowledge transfer and AI expertise for SMEs
  "partner_aiproject", # partnerships around AI projects
  "equity_finance"     # information about equity financing for AI startups
)

# all_of() fails loudly if one of the expected columns is missing.
temp <- temp %>%
  mutate(across(all_of(importance_items), recode_importance))

Merged data frame of the first consultation round

# Harmonised labels (old value = new value) so that the first round uses the
# same category names as the later rounds.
type_labels <- c(
  "NGO (Non-governmental organisation)" = "Non-governmental organisation (NGO)",
  "Academic/Research Institution" = "Academic/research Institution",
  "EU Citizen" = "EU citizen",
  "Company/Business organisation" = "Company/business organisation",
  "Consumer Organisation" = "Consumer organisation",
  "Trade Union" = "Trade union",
  "Business Association" = "Business association"
)
size_labels <- c(
  "Medium (< 250 employees)" = "Medium (50 to 249 employees)",
  "Small (< 50 employees)" = "Small (10 to 49 employees)",
  "Micro (< 10 employees)" = "Micro (1 to 9 employees)"
)

tidy_df1 <- temp %>%
  # first name and surname become one "person" column
  unite("person", firstname:surname, sep = " ") %>%
  mutate(
    # column indicating the first consultation round
    consult_round = "one",
    type = recode(type, !!!type_labels),
    size = recode(size, !!!size_labels)
  )

Second round of consultations (roadmap_2020) and final round (commission_adoption_2021)

library(readr)
# Scraped feedback on the Commission adoption (final round).
commission_adoption_2021 <- read_csv(
  file = "./Augmented_data/commission_adoption_2021.csv"
)
## Rows: 304 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Feedback reference, Submitted on, Submitted by, User type, Organis...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Scraped feedback on the roadmap (second round).
roadmap_2020 <- read_csv(
  file = "./Augmented_data/roadmap_2020.csv"
)
## Rows: 123 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Feedback reference, Submitted on, Submitted by, User type, Organis...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#alternative library(janitor) roadmap %>% clean_names()


# Clean the roadmap feedback (second round): shorten the column names, parse
# the submission date and translate the German category labels into the
# English labels used for the other rounds.
scrap20 <- roadmap_2020 %>%
  rename(
    country = `Country of origin`,
    id = `Feedback reference`,
    time = `Submitted on`,
    person = `Submitted by`,
    type = `User type`,
    org = Organisation,
    size = `Organisation size`,
    register = `Transparency register number`,
    initiative = Initiative,
    abstract = Paragraph,
    text = pdf
  ) %>%
  mutate(time = dmy(time)) %>%
  mutate(type = recode(type, # old value = new value
    `NRO (Nichtregierungsorganisation)` = "Non-governmental organisation (NGO)",
    `Universität/Forschungseinrichtung` = "Academic/research Institution",
    `EU-Bürger/-in` = "EU citizen",
    `Sonstiges` = "Other",
    `Unternehmen/Unternehmensverband` = "Company/business organisation",
    `Verbraucherverband` = "Consumer organisation",
    `Behörde` = "Public authority",
    `Gewerkschaft` = "Trade union",
    `Wirtschaftsverband` = "Business association",
    `-` = "Missing"
  )) %>%
  mutate(size = recode(size, # old value = new value
    `mittel (50 bis 249 Beschäftigte)` = "Medium (50 to 249 employees)",
    `klein (10 bis 49 Beschäftigte)` = "Small (10 to 49 employees)",
    `groß (250 oder mehr Beschäftigte)` = "Large (250 or more)",
    `-` = "Missing",
    `sehr klein (1 bis 9 Beschäftigte)` = "Micro (1 to 9 employees)"
  )) %>%
  # Workaround: the exact-match recode above did not work for the size labels
  # (presumably the non-ASCII characters do not match - TODO confirm), so the
  # labels are also detected by substring. "sehr" has to be tested BEFORE
  # "klein": "sehr klein" contains "klein", and the earlier order turned every
  # micro organisation into "Small" before the "sehr" test could fire.
  # The "250 oder mehr" test is an ASCII-only fallback for the large category,
  # which the substring workaround did not cover before.
  mutate(size = case_when(
    str_detect(size, "sehr") ~ "Micro (1 to 9 employees)",
    str_detect(size, "klein") ~ "Small (10 to 49 employees)",
    str_detect(size, "mittel") ~ "Medium (50 to 249 employees)",
    str_detect(size, "250 oder mehr") ~ "Large (250 or more)",
    TRUE ~ size
  )) %>%
  mutate(country = recode(country,
    `Vereinigten Staaten` = "United States",
    `Belgien` = "Belgium",
    `Slowakei` = "Slovakia",
    `Italien` = "Italy",
    `Niederlande` = "Netherlands",
    `Dänemark` = "Denmark",
    `Vereinigtes Königreich` = "United Kingdom",
    `Frankreich` = "France",
    `-` = "Missing",
    `international` = "Other",
    `Spanien` = "Spain",
    `Österreich` = "Austria",
    `Schweden` = "Sweden",
    `Polen` = "Poland",
    `Irland` = "Ireland",
    `Finnland` = "Finland",
    `Deutschland` = "Germany",
    `Ungarn` = "Hungary",
    `Tschechien` = "Czech Republic",
    `Rumänien` = "Romania",
    `Bulgarien` = "Bulgaria"
  ))
  


# Clean the Commission adoption feedback (final round): same column names as
# the roadmap data, parsed dates, and stray values folded into "Other" /
# "Missing".
scrap21 <- commission_adoption_2021 %>%
  rename(
    country = `Country of origin`,
    id = `Feedback reference`,
    time = `Submitted on`,
    person = `Submitted by`,
    type = `User type`,
    org = Organisation,
    size = `Organisation size`,
    register = `Transparency register number`,
    initiative = Initiative,
    abstract = Paragraph,
    text = pdf
  ) %>%
  mutate(
    time = dmy(time),
    # two rows carry a person's name in the user-type column
    type = recode(
      type,
      `Ukyo Mori` = "Other",
      `Johannes Kröhnert` = "Other",
      `-` = "Missing"
    ),
    country = recode(
      country,
      `Regional` = "Other",
      `Local` = "Other",
      `feedback.usertype.company` = "Other",
      `feedback.usertype.business_association` = "Other",
      `National` = "Other"
    ),
    size = recode(size, `-` = "Missing")
  )
  

# Mark which consultation round each data frame belongs to.
scrap21 <- mutate(scrap21, consult_round = "three")

# Problem 1 with scrap20: the ids F550611 and F550610 are duplicates (with an
# empty abstract and text section); the complete entry is F550619.
# scrap20 has 123 rows but should have 133!
# After filtering: 121 rows.
scrap20 <- scrap20 %>%
  mutate(consult_round = "two") %>%
  filter(id != "F550611" & id != "F550610")

# Problem 2: rows that are missing on all variables
filter(scrap20, is.na(abstract))
filter(scrap20, id == "-")
scrap20 <- filter(scrap20, id != "-")

# There are n = 85 PDFs in the folder but only n = 69 [text] entries in the csv.
# There are n = 49 entries with only an abstract but no text:
# filter(!is.na(abstract), is.na(text))
filter(scrap20, !is.na(text))

A merged data frame of two submission rounds

# Stack the second-round and final-round feedback into one data frame.
submission <- scrap20 %>% rbind(scrap21)

Merged data frame of all three submission rounds

with different cell and column numbers

they share: id, time, person, type, org, size, register, country, text

# Parse the first-round date (day.month.year) and put the shared columns in
# one fixed order directly after `time`.
tidy_df1 <- tidy_df1 %>%
  mutate(time = as.Date(time, format = "%d.%m.%Y")) %>%
  relocate(person, type, org, size, register, country, text, .after = time)
submission <- relocate(submission, text, .after = country)

###Merge all 3 data frames together

# Columns present in all three rounds; they serve as the join key.
shared_cols <- c(
  "id", "time", "type", "size", "org",
  "register", "text", "consult_round", "person", "country"
)
# Full join: rows from every round are kept, and columns that exist in only
# one of the inputs are filled with NA for the other.
three_submission <- tidy_df1 %>% full_join(submission, by = shared_cols)
saveRDS(three_submission, file = "three_submission.rds")
# Linear model: importance of cooperation with member states explained by the
# other importance items, including a research_innov x excel_research
# interaction.
coop_formula <- coop_member_states ~ research_innov * excel_research +
  benefits_ai + public_sector + private_sector + SME + lighthouse +
  net_centres + testing_fac + startup_finance
reg_linear <- lm(formula = coop_formula, data = three_submission)

# Coefficient plot
plot(model_parameters(reg_linear))

# Plot of the residual-normality check
plot(check_normality(reg_linear))

# Coefficient table
model_parameters(reg_linear)
#Interaction
#library(sjPlot)

#plot_model(
#  reg_linear,
#  type = "int"
#)
library(datawizard)
## Warning: Paket 'datawizard' wurde unter R Version 4.1.3 erstellt
# Descriptive statistics (without the IQR) for four of the importance items.
described_items <- c("coop_member_states", "skills", "SME", "research_innov")
describe_distribution(
  select(three_submission, all_of(described_items)),
  iqr = FALSE
)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Print the regression table for reg_linear (LaTeX by default, as in the output below).
stargazer(reg_linear)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Thu, Mar 23, 2023 - 21:40:29
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
## \cline{2-2} 
## \\[-1.8ex] & coop\_member\_states \\ 
## \hline \\[-1.8ex] 
##  research\_innov & 0.502$^{***}$ \\ 
##   & (0.058) \\ 
##   & \\ 
##  excel\_research & 0.446$^{***}$ \\ 
##   & (0.059) \\ 
##   & \\ 
##  benefits\_ai & 0.049$^{*}$ \\ 
##   & (0.025) \\ 
##   & \\ 
##  public\_sector & 0.077$^{***}$ \\ 
##   & (0.028) \\ 
##   & \\ 
##  private\_sector & 0.066$^{**}$ \\ 
##   & (0.029) \\ 
##   & \\ 
##  SME & 0.004 \\ 
##   & (0.028) \\ 
##   & \\ 
##  lighthouse & 0.029 \\ 
##   & (0.027) \\ 
##   & \\ 
##  net\_centres & 0.160$^{***}$ \\ 
##   & (0.034) \\ 
##   & \\ 
##  testing\_fac & 0.010 \\ 
##   & (0.029) \\ 
##   & \\ 
##  startup\_finance & $-$0.017 \\ 
##   & (0.029) \\ 
##   & \\ 
##  research\_innov:excel\_research & $-$0.093$^{***}$ \\ 
##   & (0.014) \\ 
##   & \\ 
##  Constant & 0.487$^{***}$ \\ 
##   & (0.151) \\ 
##   & \\ 
## \hline \\[-1.8ex] 
## Observations & 1,017 \\ 
## R$^{2}$ & 0.441 \\ 
## Adjusted R$^{2}$ & 0.434 \\ 
## Residual Std. Error & 0.885 (df = 1005) \\ 
## F Statistic & 71.963$^{***}$ (df = 11; 1005) \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
# Mean importance ratings per country, using only rows that are complete on
# the four items and on country.
three_submission %>%
  drop_na(coop_member_states, skills, SME, research_innov, country) %>%
  group_by(country) %>%
  summarise(
    research_mean = mean(research_innov),
    coop_mean = mean(coop_member_states)
  )
library(sjPlot)
## Warning: Paket 'sjPlot' wurde unter R Version 4.1.3 erstellt
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
# Predicted values of the outcome across research_innov.
plot_model(reg_linear, type = "pred", terms = "research_innov")